Problem Statement
Crime data for the years 2012 through 2016 from the Atlanta Police Department (APD) is provided. Explore the data set and visualize the findings. Build a prediction model using the 2012-2014 data and predict the number of crimes in 2015-2016.
Useful Features
Target
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as mn
import folium
from folium import plugins
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from xgboost import XGBRegressor
import matplotlib.pylab as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
# Load one APD workbook per year (2012-2016) into a dict keyed by the year string.
dfs = {}
for year in range(2012, 2017):
    dfs[str(year)] = pd.read_excel("./data/apd/COBRA-YTD{}.xlsx".format(year), sheet_name='CRIME_DATA')
# Unpack the per-year frames (dict preserves insertion order, so this is 2012..2016).
df2012, df2013, df2014, df2015, df2016 = dfs.values()
# one dataframe with crimes from 2012 to 2016
df = pd.concat(dfs.values(), ignore_index=True)
df.shape
pd.set_option('display.max_columns', None)  # show every column when displaying frames
df.head()
df.info()
df.describe()
# overview of missing data (missingno nullity matrix)
mn.matrix(df)
plt.show()
# missing data percentage
def missing(df):
    '''Summarize missing data per column of DF: absolute count ('Total') and
    fraction ('Percent') of missing entries, sorted by count descending.'''
    null_counts = df.isnull().sum()
    # isnull().count() counts every cell per column, i.e. the row count
    null_fraction = null_counts / df.isnull().count()
    summary = pd.concat([null_counts, null_fraction], axis=1, keys=['Total', 'Percent'])
    return summary.sort_values('Total', ascending=False)
# Show only the columns that actually have missing values.
# Compute the summary once instead of calling missing(df) twice.
missing_summary = missing(df)
missing_summary[missing_summary['Total'] > 0]
# helper function to build an occur_date dataframe
def make_df_date(df):
    '''Build a date-feature frame from DF's 'occur_date' column.

    Returns a new dataframe with columns: date (parsed datetime), year,
    month, day, and weekday (Monday=1 .. Sunday=7).
    '''
    parsed = pd.to_datetime(df['occur_date'])
    return pd.DataFrame({
        'date': parsed,
        'year': parsed.dt.year,
        'month': parsed.dt.month,
        'day': parsed.dt.day,
        'weekday': parsed.dt.weekday + 1,  # shift pandas' Monday=0 to Monday=1
    })
def make_df_date_period(df):
    '''
    Build a date-feature frame from DF's 'occur_date' column.

    Returns a new dataframe with columns: date, year, month, day, weekday
    (Monday=1 .. Sunday=7), plus "sin_"/"cos_" cyclic encodings of month,
    day-of-month, and weekday so that period boundaries (e.g. December and
    January) are close in feature space. This can be useful for better
    model performance.
    '''
    df_date = pd.DataFrame()
    date = pd.to_datetime(df['occur_date'])
    df_date['date'] = date
    df_date['year'] = date.dt.year
    df_date['month'] = date.dt.month
    df_date['day'] = date.dt.day
    df_date['weekday'] = date.dt.weekday + 1  # Monday=1, Sunday=7
    # Vectorized trig on whole columns replaces per-element .apply(lambda ...):
    # identical values, one numpy pass instead of a Python call per row.
    df_date["sin_month"] = np.sin(2 * np.pi * df_date["month"] / 12)
    df_date["cos_month"] = np.cos(2 * np.pi * df_date["month"] / 12)
    # dt.days_in_month gives the month length per row directly, avoiding the
    # original list comprehension that built a pd.Period object per row.
    day_percent = df_date["day"] / date.dt.days_in_month
    df_date["sin_day"] = np.sin(2 * np.pi * day_percent)
    df_date["cos_day"] = np.cos(2 * np.pi * day_percent)
    df_date["sin_weekday"] = np.sin(2 * np.pi * df_date["weekday"] / 7)
    df_date["cos_weekday"] = np.cos(2 * np.pi * df_date["weekday"] / 7)
    return df_date
# helper function to plot the distribution
def period_count(df, period, year=None, crime='all crimes', order=None):
    '''
    Plot a histogram of crime counts from DF binned by the PERIOD column
    (e.g. 'year', 'month', 'hour'). YEAR and CRIME only label the title;
    ORDER fixes the x-axis category order.
    '''
    plt.figure(figsize=(15,5))
    if year:
        title = "Number of crimes occured in a particular {} in {} for {}.".format(period, year, crime)
    else:
        title = "Number of crimes occured in a particular {} for {}.".format(period, crime)
    plt.title(title)
    sns.countplot(x=period, data=df, order=order)
    plt.show()
# Date-feature frames over the full 2012-2016 data.
df_date = make_df_date(df)
df_date_period = make_df_date_period(df)
# date dataframes for different years
dfs_date = {}
for year in range(2012, 2017):
    dfs_date[str(year)] = make_df_date(dfs[str(year)])
df2012_date, df2013_date, df2014_date, df2015_date, df2016_date = dfs_date.values()
# Number of crimes occurred in different years while recorded in 2012
# (presumably the 2012 workbook also contains incidents that occurred earlier).
period_count(df2012_date, 'year')
# number of all crimes from 2012 to 2016
period_count(df_date, 'year', order=range(2012,2017))
period_count(df_date, 'month', year='2012-2016')
period_count(df_date, 'weekday', year='2012-2016')
plt.figure(figsize=(15,5))
# Daily counts: value_counts over 'date' gives crimes per calendar day.
df_date['date'][df_date['year']>=2012].value_counts().plot()
plt.xlabel('year')
plt.ylabel('count')
plt.title('Number of crimes per day')
plt.show()
# helper function to build an occur_time dataframe
def make_df_time(df):
    '''Build a time-feature frame from DF's 'occur_time' column.

    Returns a new dataframe with columns: hour, and min_in_day
    (minutes elapsed since midnight).
    '''
    parsed = pd.to_datetime(df['occur_time'], format='%H:%M:%S')
    hours = parsed.dt.hour
    return pd.DataFrame({
        'hour': hours,
        'min_in_day': hours * 60 + parsed.dt.minute,
    })
def make_df_time_period(df):
    '''
    Build a time-feature frame from DF's 'occur_time' column.

    Returns a new dataframe with columns: hour, min_in_day (minutes since
    midnight), plus "sin_min"/"cos_min" cyclic encodings of minute-of-day
    (period 1440) so that 23:59 and 00:00 are close in feature space.
    This can be useful for better model performance.
    '''
    df_time = pd.DataFrame()
    time = pd.to_datetime(df['occur_time'], format='%H:%M:%S')
    df_time['hour'] = time.dt.hour
    df_time['min_in_day'] = time.dt.hour * 60 + time.dt.minute
    # Vectorized trig replaces per-element .apply(lambda ...): identical
    # values, one numpy pass instead of a Python call per row.
    df_time["sin_min"] = np.sin(2 * np.pi * df_time['min_in_day'] / 1440)
    df_time["cos_min"] = np.cos(2 * np.pi * df_time['min_in_day'] / 1440)
    return df_time
# Time-feature frames over the full 2012-2016 data.
df_time = make_df_time(df)
df_time_period = make_df_time_period(df)
# time dataframes for different years
dfs_time = {}
for year in range(2012, 2017):
    dfs_time[str(year)] = make_df_time(dfs[str(year)])
df2012_time, df2013_time, df2014_time, df2015_time, df2016_time = dfs_time.values()
# hourly distribution of all crimes
period_count(df_time, 'hour', year='2012-2016')
# helper function to plot the distribution
def count_plot(df, fea, year='2012-2016', crime='all crimes'):
    '''
    Plot the distribution of the FEA column of DF as a bar chart of the
    15 most frequent values. Missing values are shown as 'Unknown'.
    YEAR and CRIME only label the title.
    '''
    plt.figure(figsize=(15,5))
    counts = df[fea].value_counts(dropna=False)
    t_df = pd.DataFrame(counts).reset_index().fillna('Unknown')
    t_df.columns = [fea, 'Count']
    top15 = t_df.head(15)
    sns.barplot(x=fea, y='Count', data=top15)
    plt.xticks(rotation=60)
    plt.title(fea + ' distribution in {} for {}'.format(year, crime))
    plt.show()
count_plot(df, 'Shift')
# LabelEncoder operates on 1-D label arrays; the original passed
# .values.reshape(-1, 1), a 2-D column vector, which sklearn rejects
# (or warns about) for label encoders. Pass the series directly.
le = LabelEncoder()
le.fit_transform(df['Shift'])
# helper function to do label encoding
def label(df, fea):
    '''Label encode the FEA column of DF (missing values become 'Unk').

    Returns a single-column dataframe named FEA holding the integer codes.
    '''
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[fea].fillna('Unk'))
    return pd.DataFrame(data={fea: codes})
# label encode shift
df_shift = label(df, 'Shift')
df_shift.head()
# how many distinct neighborhoods appear in the data
df['neighborhood'].unique().shape
count_plot(df, 'neighborhood') # only top 15 neighborhoods are shown
# label encode neighborhood
df_neighborhood = label(df, 'neighborhood')
df_neighborhood.head()
# how many distinct location strings appear in the data
df['location'].unique().shape
count_plot(df, 'location') # only top 15 locations are shown
# the five most frequent location strings
df['location'].value_counts().head(5).index
# label encode location
df_location = label(df, 'location')
df_location.head()
# distribution of the crime-type column
count_plot(df, 'UC2 Literal')
def hour_plots(df, fea='UC2 Literal'):
    '''Plot one hourly-count histogram per crime type (column FEA of DF),
    arranged in a grid three subplots wide.'''
    fig = plt.figure(figsize=(15,15))
    crime_types = df[fea].value_counts().index
    n_rows = int(np.ceil(float(len(crime_types)) / 3))
    grouped = df.groupby(fea)
    for idx, ctype in enumerate(crime_types, start=1):
        ax = fig.add_subplot(n_rows, 3, idx)
        times = make_df_time(grouped.get_group(ctype))
        sns.countplot(x='hour', data=times, ax=ax)
        ax.set_title(ctype)
        ax.set_xticklabels(sorted(times['hour'].unique()), rotation=90)
    fig.tight_layout()
    plt.show()
def neighbor_plots(df, fea='UC2 Literal'):
    '''Plot the top-15 neighborhoods per crime type (column FEA of DF),
    arranged in a grid three subplots wide.'''
    fig = plt.figure(figsize=(15,20))
    ctypes = df[fea].value_counts().index
    for i, crime_type in enumerate(ctypes):
        ax = fig.add_subplot(int(np.ceil(float(len(ctypes)) / 3)), 3, i+1)
        df_crime = df.groupby(fea).get_group(crime_type)
        t_df = pd.DataFrame(df_crime['neighborhood'].value_counts(dropna=False)).reset_index().fillna('Unknown')
        t_df.columns = ['neighborhood', 'Count']
        top = t_df.head(15)
        # Draw explicitly onto this subplot instead of relying on the
        # implicit "current axes".
        sns.barplot(x='neighborhood', y='Count', data=top, ax=ax)
        # BUGFIX: only the top-15 rows are plotted, so only their names may
        # be used as tick labels; the original passed the full neighborhood
        # list, mislabeling the bars.
        ax.set_xticklabels(top['neighborhood'], rotation=90)
        ax.set_title(crime_type)
    fig.tight_layout()
    plt.show()
# hourly and neighborhood breakdowns for every crime type
hour_plots(df)
neighbor_plots(df)
# group the dataframe by crime type
g_crime_type = df.groupby('UC2 Literal')
# LARCENY-FROM VEHICLE: yearly/monthly/hourly distributions and top areas
df_LFV = g_crime_type.get_group('LARCENY-FROM VEHICLE')
df_LFV_date = make_df_date(df_LFV)
df_LFV_time = make_df_time(df_LFV)
period_count(df_LFV_date, 'year', crime='LARCENY-FROM VEHICLE', order=range(2012,2017))
period_count(df_LFV_date, 'month', crime='LARCENY-FROM VEHICLE', year='2012-2016')
period_count(df_LFV_time, 'hour', crime='LARCENY-FROM VEHICLE', year='2012-2016')
count_plot(df_LFV, 'neighborhood', crime='LARCENY-FROM VEHICLE')
count_plot(df_LFV, 'location', crime='LARCENY-FROM VEHICLE')
# LARCENY-NON VEHICLE: same breakdowns
df_LNV = g_crime_type.get_group('LARCENY-NON VEHICLE')
df_LNV_date = make_df_date(df_LNV)
df_LNV_time = make_df_time(df_LNV)
period_count(df_LNV_date, 'year', crime='LARCENY-NON VEHICLE', order=range(2012,2017))
period_count(df_LNV_date, 'month', crime='LARCENY-NON VEHICLE', year='2012-2016')
period_count(df_LNV_time, 'hour', crime='LARCENY-NON VEHICLE', year='2012-2016')
count_plot(df_LNV, 'neighborhood', crime='LARCENY-NON VEHICLE')
count_plot(df_LNV, 'location', crime='LARCENY-NON VEHICLE')
# BURGLARY-RESIDENCE: same breakdowns
df_BR = g_crime_type.get_group('BURGLARY-RESIDENCE')
df_BR_date = make_df_date(df_BR)
df_BR_time = make_df_time(df_BR)
period_count(df_BR_date, 'year', crime='BURGLARY-RESIDENCE', order=range(2012,2017))
period_count(df_BR_date, 'month', crime='BURGLARY-RESIDENCE', year='2012-2016')
period_count(df_BR_time, 'hour', crime='BURGLARY-RESIDENCE', year='2012-2016')
count_plot(df_BR, 'neighborhood', crime='BURGLARY-RESIDENCE')
count_plot(df_BR, 'location', crime='BURGLARY-RESIDENCE')
# AUTO THEFT: same breakdowns
df_AT = g_crime_type.get_group('AUTO THEFT')
df_AT_date = make_df_date(df_AT)
df_AT_time = make_df_time(df_AT)
period_count(df_AT_date, 'year', crime='AUTO THEFT', order=range(2012,2017))
period_count(df_AT_date, 'month', crime='AUTO THEFT', year='2012-2016')
period_count(df_AT_time, 'hour', crime='AUTO THEFT', year='2012-2016')
count_plot(df_AT, 'neighborhood', crime='AUTO THEFT')
count_plot(df_AT, 'location', crime='AUTO THEFT')
# AGG ASSAULT: same breakdowns
df_AA = g_crime_type.get_group('AGG ASSAULT')
df_AA_date = make_df_date(df_AA)
df_AA_time = make_df_time(df_AA)
period_count(df_AA_date, 'year', crime='AGG ASSAULT', order=range(2012,2017))
period_count(df_AA_date, 'month', crime='AGG ASSAULT', year='2012-2016')
period_count(df_AA_time, 'hour', crime='AGG ASSAULT', year='2012-2016')
count_plot(df_AA, 'neighborhood', crime='AGG ASSAULT')
count_plot(df_AA, 'location', crime='AGG ASSAULT')
# label encode the crime type itself (used as a model feature later)
df_crime_type = label(df, 'UC2 Literal')
df_crime_type.head()
# Center of the folium base map (downtown Atlanta, [lat, lon]).
alt_loc = [33.7490, -84.3880]
base_map = folium.Map(location=alt_loc, zoom_start=11)
# HeatMap expects [lat, lon] pairs; here 'y' is passed first, so it is
# presumably latitude and 'x' longitude -- verify against the data.
heatmap = plugins.HeatMap(df[['y', 'x']], radius=3, blur=6)
base_map.add_child(heatmap)
# helper function to stack the heatmap of different crime types
def heat_maps(df, fea='UC2 Literal'):
    '''Plot one coordinate scatter + 2-D KDE density overlay per crime type
    (column FEA of DF), arranged in a grid four subplots wide.
    Uses the 'x'/'y' coordinate columns of DF.'''
    fig = plt.figure(figsize=(15,15))
    ctypes = df[fea].value_counts().index
    for i, crime_type in enumerate(ctypes):
        ax = fig.add_subplot(int(np.ceil(float(len(ctypes)) / 4)), 4, i+1)
        crime_df = df.groupby(fea).get_group(crime_type)
        sns.regplot('x', 'y', data=crime_df[['x','y']], fit_reg=False, scatter_kws={'alpha':0.5, 'color':'grey'}, ax=ax)
        # BUGFIX: kdeplot has no uppercase X=/Y= parameters; pass the two
        # coordinate series directly for a bivariate KDE.
        sns.kdeplot(crime_df['x'], crime_df['y'], cmap="jet", bw=.003, cbar=True, shade=True,
                    shade_lowest=False, ax=ax)
        ax.set_title(crime_type)
        ax.set_axis_off()
    plt.show()
heat_maps(df2012)
g_crime = df.groupby('UC2 Literal')
df_rape = g_crime.get_group('RAPE')
# RAPE records cluster on a few repeated coordinates (possibly anonymized
# locations -- verify), which is why they are excluded from the combined
# heatmap below.
df_rape[['y', 'x']].value_counts()
# helper function to stack the heatmap of all crimes, excluding RAPE
def heat_map_all(df, fea='UC2 Literal'):
    '''Plot one coordinate scatter + 2-D KDE density overlay for all crimes
    in DF combined, excluding RAPE (whose coordinates are not reliable).'''
    fig = plt.figure(figsize=(12,12))
    crime_df = df[df[fea] != 'RAPE']
    sns.regplot('x', 'y', data=crime_df[['x','y']], fit_reg=False, scatter_kws={'alpha':0.5, 'color':'grey'})
    # BUGFIX: kdeplot has no uppercase X=/Y= parameters; pass the two
    # coordinate series directly for a bivariate KDE.
    sns.kdeplot(crime_df['x'], crime_df['y'], cmap="jet", bw=.003, cbar=True, shade=True, shade_lowest=False)
    plt.show()
heat_map_all(df2012)
# same per-type and combined density maps for the remaining years
heat_maps(df2013)
heat_map_all(df2013)
heat_maps(df2014)
heat_map_all(df2014)
heat_maps(df2015)
heat_map_all(df2015)
heat_maps(df2016)
heat_map_all(df2016)
# Feature matrix for XGBoost: periodic date/time encodings, label-encoded
# categoricals, and raw coordinates, concatenated column-wise.
my_dfs = [df_date_period, df_time_period, df_shift, df_neighborhood, df_location, df_crime_type, df[['x', 'y']]]
xgb_df = pd.concat(my_dfs, axis=1)
# drop rows whose occur_date predates 2012, then order chronologically
xgb_df = xgb_df[xgb_df['year']>=2012].reset_index(drop=True)
xgb_df = xgb_df.sort_values('date').reset_index(drop=True)
xgb_df.head()
xgb_df.shape
# crimes per year, to derive the hard-coded totals used as the target below
temp_df = df_date[df_date['year'] >= 2012][['year']]
temp_df['crimes'] = 1
temp_df.groupby('year').sum()
# Every row of a given year gets that year's total crime count as its target.
# NOTE(review): the target is therefore constant within each year -- confirm
# this per-row framing of a yearly quantity is intended.
check = {2012: 33352, 2013: 32252, 2014: 31159, 2015: 30092, 2016: 28932}
y = np.array([check[year] for year in xgb_df['year']])
y.shape
# all columns except the raw date and year are model features
features = list(xgb_df.columns)
features.remove('date')
features.remove('year')
# train on 2012-2014 rows, test on 2015-2016 rows (frame is sorted by date)
split = len(xgb_df[xgb_df['year'] < 2015])
train_y = np.asarray(y[:split])
train_X = xgb_df[features][:split]
test_y = np.asarray(y[split:])
test_X = xgb_df[features][split:]
# Fit the gradient-boosted regressor on the 2012-2014 rows.
model_xgb = XGBRegressor(random_state=42, n_jobs=-1, objective='reg:squarederror')
model_xgb = model_xgb.fit(train_X, train_y)
train_y_pred_xgb = model_xgb.predict(train_X)
test_y_pred_xgb = model_xgb.predict(test_X)
# MSE for train and test data (MSE is symmetric in its arguments)
print("Train MSE: {:.4f}".format(mean_squared_error(train_y, train_y_pred_xgb)))
print("Test MSE: {:.4f}".format(mean_squared_error(test_y, test_y_pred_xgb)))
# R2 for train and test data.
# BUGFIX: r2_score is NOT symmetric -- the signature is
# r2_score(y_true, y_pred); the original passed predictions first,
# which yields a different (wrong) R2 value.
print("Train R2 score: {:.4f}".format(r2_score(train_y, train_y_pred_xgb)))
print("Test R2 score: {:.4f}".format(r2_score(test_y, test_y_pred_xgb)))
# visualization: per-row targets vs predictions over time
plt.figure(figsize=(12,6))
plt.scatter(xgb_df['date'], y, s=50, alpha=0.6, label='original')
# train then test predictions, concatenated in chronological order
plt.scatter(xgb_df['date'], list(train_y_pred_xgb) + list(test_y_pred_xgb), s=50, alpha=0.6, label='predicted')
plt.xlabel('year')
plt.ylabel('number of crimes')
plt.legend(loc='best')
plt.title('Number of crimes per year')
plt.show()
# build a dataframe with date as index and only feature is the number of crimes each day
dl_df = df_date[df_date['year'] >= 2012][['date']]
dl_df['crimes'] = 1
dl_df = dl_df.groupby('date').sum()  # one row per calendar day
dl_df.head()
plt.figure(figsize=(12,6))
plt.plot(dl_df)
plt.xlabel('year')
plt.ylabel('number of crimes')
plt.title('Number of crimes per day')
plt.show()
# rescale the data for faster training
scaler = MinMaxScaler()
data_rescale = scaler.fit_transform(dl_df)
# flatten the (n, 1) scaled array into a 1-D sequence
data_rescale = data_rescale.reshape(1,len(dl_df))[0]
# build the final data frame with previous 31 days of data as X and next day data as Y
print(len(data_rescale))
y = [] # to be the number of crimes in a certain day
x = [] # to be the number of crimes in the previous 31 days
for i in range(31, len(data_rescale)):
    y.append(data_rescale[i])
    x.append(data_rescale[i-31:i])
x = pd.DataFrame(x)
# index each 31-day window by the day it predicts
x = x.set_index(dl_df.index[31:])
print(x.shape)
x.head()
# split the train (2012-2014) and test (2015-2016) data
split = len(x[x.index < '2015'])
train_y = np.asarray(y[:split])
train_X = x[:split].values
test_y = np.asarray(y[split:])
test_X = x[split:].values
# check the data shape
print(train_X.shape, len(train_y), test_X.shape, len(test_y))
# reshape to (samples, timesteps, features) as expected by Keras LSTM layers
train_X = np.reshape(train_X, (train_X.shape[0], train_X.shape[1],1))
test_X = np.reshape(test_X, (test_X.shape[0], test_X.shape[1], 1))
# build a LSTM model: four stacked 50-unit LSTM layers with 0.4 dropout
# after each, followed by a single-unit regression output
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=train_X.shape[1:]))
model.add(Dropout(0.4))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(units=50, return_sequences = True))
model.add(Dropout(0.4))
# final LSTM layer returns only the last output, feeding the Dense head
model.add(LSTM(units=50))
model.add(Dropout(0.4))
model.add(Dense(units=1))
model.compile(optimizer='rmsprop', loss='mean_squared_error')
model.summary()
# training
out = model.fit(train_X, train_y, epochs=100, batch_size=32)
train_y_predict = model.predict(train_X)
test_y_predict = model.predict(test_X)
# MSE for train and test data (MSE is symmetric, so argument order is harmless here)
print("Train MSE: {:.4f}".format(mean_squared_error(train_y_predict, train_y)))
print("Test MSE: {:.4f}".format(mean_squared_error(test_y_predict, test_y)))
# visualize the prediction: undo the MinMax scaling first
predict = np.concatenate((scaler.inverse_transform(train_y_predict), scaler.inverse_transform(test_y_predict)), axis = 0)
output = pd.DataFrame(index=dl_df.index[31:])
# NOTE(review): inverse_transform is called on a (1, n) row here while the
# scaler was fit on a single column; this relies on numpy broadcasting of
# the single scale factor -- confirm it behaves on your sklearn version.
output['original'] = scaler.inverse_transform(np.array(y).reshape(1, -1))[0]
output['predicted'] = np.reshape(predict, (1, -1))[0]
plt.figure(figsize=(12,6))
plt.plot(output['original'], label='original')
plt.plot(output['predicted'], label='predicted')
plt.xlabel('year')
plt.ylabel('number of crimes')
plt.legend(loc='best')
plt.title('Number of crimes per day')
plt.show()
# number of crimes per year: resample daily rows to yearly sums
eva_df_dl = output.resample('Y').sum()
eva_df_dl['difference (%)'] = 100 * (eva_df_dl['predicted'] - eva_df_dl['original'])/eva_df_dl['original']
eva_df_dl
# yearly totals, for a simple year-vs-count linear regression baseline
df_year = dl_df.resample('Y').sum()
df_year
# train on 2012-2014 totals, test on 2015-2016 totals
train_y = df_year['crimes'][:3]
train_X = np.array(range(2012, 2015)).reshape(-1,1)
test_y = df_year['crimes'][3:]
test_X = np.array(range(2015, 2017)).reshape(-1,1)
model_lr = LinearRegression()
model_lr.fit(train_X, train_y)
train_y_pred_lr = model_lr.predict(train_X)
test_y_pred_lr = model_lr.predict(test_X)
# R2 for train and test data.
# BUGFIX: r2_score is NOT symmetric -- the signature is
# r2_score(y_true, y_pred); the original passed predictions first,
# which yields a different (wrong) R2 value.
print("Train R2 score: {:.10f}".format(r2_score(train_y, train_y_pred_lr)))
print("Test R2 score: {:.10f}".format(r2_score(test_y, test_y_pred_lr)))
# visualization: yearly totals vs the linear-regression fit
plt.figure(figsize=(12,6))
plt.scatter(range(2012, 2017), df_year['crimes'], s=100, alpha=0.6, label='original')
plt.scatter(range(2012, 2017), list(train_y_pred_lr) + list(test_y_pred_lr), s=100, alpha=0.6, label='predicted')
plt.xlabel('year')
plt.ylabel('number of crimes')
plt.legend(loc='best')
plt.title('Number of crimes per year')
plt.show()
# number of crimes per year: original vs predicted with percentage error
data = {
    'original': df_year['crimes'].values,
    'predicted': list(train_y_pred_lr) + list(test_y_pred_lr)
}
eva_df_lr = pd.DataFrame(data=data, index=range(2012, 2017))
eva_df_lr['difference (%)'] = 100 * (eva_df_lr['predicted'] - eva_df_lr['original'])/eva_df_lr['original']
eva_df_lr